Library calls
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(readr)
library(patchwork)
knitr::opts_chunk$set(
echo = TRUE,
warning = FALSE,
fig.width = 8,
fig.height = 6,
out.width = "90%")
theme_set(theme_minimal() + theme(legend.position = "bottom"))
options(
ggplot2.continuous.colour = "viridis",
ggplot2.continuous.fill = "viridis")
scale_colour_discrete = scale_colour_viridis_d
scale_fill_discrete = scale_fill_viridis_d
Load and clean NYC rat sighting data:
rat_df = read_csv("Rat_Sightings.csv") %>%
janitor::clean_names() %>%
mutate(created_date = gsub(" .*","", created_date),
address_type = str_to_title(address_type),
city = str_to_title(city),
borough = str_to_title(borough),
location_type = recode(location_type, "Other (Explain Below)" = "Other")) %>%
separate(created_date, into = c("month", "day", "year"), sep = "/") %>%
mutate(year = as.numeric(year),
month = as.numeric(month),
day = as.numeric(day)) %>%
filter(borough != "Unspecified") %>%
select(month, day, year, location_type, address_type, city, borough, latitude, longitude)
## Rows: 207580 Columns: 38
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (25): Created Date, Closed Date, Agency, Agency Name, Complaint Type, De...
## dbl (6): Unique Key, Incident Zip, X Coordinate (State Plane), Y Coordinate...
## lgl (7): Vehicle Type, Taxi Company Borough, Taxi Pick Up Location, Bridge ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Load NYC asthma emergency visit data for adults:
asthma_df = read_csv("asthma_ER_visits_adults.csv") %>%
janitor::clean_names() %>%
rename(
year = time
) %>%
mutate(age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000),
estimated_annual_rate_per_10_000 = as.numeric(estimated_annual_rate_per_10_000),
number = as.numeric(number)) %>%
filter(geo_type == "Borough") %>%
group_by(year, geography) %>%
select(year, geography, age_adjusted_rate_per_10_000, estimated_annual_rate_per_10_000, number)
## Rows: 584 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): GeoType, Geography, Age-adjusted rate per 10,000, Estimated annual ...
## dbl (3): Time, GeoID, GeoRank
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Merge datasets by year and borough:
asthma_rat_df <- left_join(rat_df, asthma_df, by = c('year' = 'year', 'borough' = 'geography'))
Map sightings by coordinate:
Heat map w/density
rat_df %>%
plot_ly(
type = 'densitymapbox',
lat = ~latitude,
lon = ~longitude,
coloraxis = 'coloraxis',
radius = 10,
color = ~borough) %>%
layout(
mapbox = list(
style = "stamen-terrain",
zoom = 9,
center = list(lon = -73.9, lat = 40.7)),
coloraxis = list(colorscale = "Viridis"),
title = ("Density Map of Rat Sightings"))
Plot of sightings over time in NYC overall:
overall_rat_line =
rat_df %>%
group_by(year) %>%
count() %>%
summarise(n_obs = n) %>%
ggplot(aes(x = year, y = n_obs)) +
scale_x_continuous(breaks = seq(2010, 2022, 2)) +
geom_line() +
labs(
title = "Rat Sightings Over Time in NYC",
x = "Year",
y = "Number of Sightings")
overall_rat_line

Plot of sightings over time in NYC by borough:
rat_line = rat_df %>%
group_by(borough, year) %>%
count() %>%
summarise(n_obs = n) %>%
ggplot(aes(x = year, y = n_obs , color = borough )) +
geom_line() +
scale_x_continuous(breaks = seq(2010, 2022, by = 1)) +
labs(
title = "Rat Sightings Over Time by Borough",
x = "Year",
y = "Number of Sightings")
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
rat_line

Bar graph of all sightings by borough:
rat_bar =
rat_df %>%
count(borough) %>%
mutate(borough = fct_reorder(borough, n)) %>%
ggplot(aes(x = borough, y = n, fill = borough)) +
geom_bar(stat = "identity") +
labs(
title = "Frequency of Rat Sightings by Borough (2010-2022)",
x = "Borough",
y = "Number of Sightings",
fill = "Borough")
#plot_ly(x = ~borough, y = ~n, color = ~borough, type = "bar", colors = "viridis")
rat_bar

Plot of frequency of sightings by borough and year:
rat_bar_time =
rat_df %>%
group_by(borough) %>%
count(year) %>%
ggplot(aes(x = year, y = n, fill = borough)) +
geom_bar(stat = "identity",
position = "dodge") +
scale_x_continuous(breaks = seq(2010, 2022, by = 1)) +
labs(
title = "Frequency of Sightings in Boroughs Over Time",
x = "Year",
y = "Number of Sightings",
fill = "Borough")
rat_bar_time

Location type:
location_bar =
rat_df %>%
count(location_type) %>%
mutate(
location_type = fct_reorder(location_type, n),
ranking = min_rank(desc(n))) %>%
filter(ranking <= 10) %>%
arrange(n) %>%
ggplot(aes(x = location_type, y = n, fill = location_type)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Location Types",
x = "Location Type",
y = "Number of Sightings",
fill = "location_type") + coord_flip() +
theme(legend.position = "none")
location_bar

location_grid =
rat_df %>%
group_by(borough, location_type) %>%
count() %>%
summarise(n_obs = n) %>%
mutate(
location_type = fct_reorder(location_type, n_obs),
ranking = min_rank(desc(n_obs))) %>%
filter(ranking <= 3) %>%
arrange(ranking) %>%
ggplot(aes(x = location_type, y = n_obs, fill = location_type)) +
geom_bar(stat = "identity") +
facet_grid(. ~ borough) +
labs(title = "Top Sighting Locations by Borough",
x = "Location Type",
y = "Number of Sightings",
fill = "location_type") +
theme(axis.text.x = element_blank())
## `summarise()` has grouped output by 'borough'. You can override using the
## `.groups` argument.
#theme(axis.text.x = element_text(angle = 90, hjust = 1))
location_grid

Rat asthma plot:
asthma_line =
asthma_rat_df %>%
filter(year > 2009) %>%
mutate(age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000, na.rm = TRUE)) %>%
group_by(year) %>%
summarize(mean_age_adjust = mean(age_adjusted_rate_per_10_000)) %>%
drop_na(mean_age_adjust) %>%
ggplot(aes(x = year, y = mean_age_adjust)) +
geom_line() +
labs(
title = "Asthma ED Visits Over Time in NYC",
x = "Year",
y = "Mean age adjusted rate per 10,000")
asthma_line + overall_rat_line

asthma_num_df =
asthma_df %>%
mutate(
number = as.numeric(number),
data = "Total asthma ER visits"
) %>%
group_by(year, data) %>%
summarise(n_obs = sum(number, na.rm = TRUE))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
asthma_rate_df =
asthma_df %>%
mutate(
age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000),
data = "Age adjusted asthma ER visit rate"
) %>%
group_by(year, data) %>%
summarise(n_obs = mean(age_adjusted_rate_per_10_000, na.rm = TRUE))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
asthma_estimated_df =
asthma_df %>%
mutate(
estimated_annual_rate_per_10_000 = as.numeric(estimated_annual_rate_per_10_000),
data = "Estimated annual asthma rate"
) %>%
group_by(year, data) %>%
summarise(n_obs = mean(estimated_annual_rate_per_10_000, na.rm = TRUE))
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
asthma_years_num_plot =
asthma_num_df %>%
ggplot(aes(x = year, y = n_obs)) +
geom_line() +
scale_x_continuous(breaks = seq(2005, 2018, 1)) +
labs(title = "Total number of asthma ER visits per year",
x = "Year",
y = "Total number of asthma ER visits")
asthma_rate_estimated_plot =
ggplot(data = asthma_rate_df, aes(x = year, y = n_obs, color = data)) +
geom_line() +
geom_line(data = asthma_estimated_df) +
scale_x_continuous(breaks = seq(2005, 2018, 1)) +
labs(title = "Age adjusted vs. Estimated Asthma Rates Over The Years",
x = "Year",
y = "Rate per 10,000 people")
asthma_years_num_plot

asthma_rate_estimated_plot

asthma_rate_borough_plot =
asthma_df %>%
group_by(geography) %>%
mutate(
age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000),
geography = as.factor(geography)
) %>%
summarise(
asthma_rate = mean(age_adjusted_rate_per_10_000, na.rm = TRUE)
) %>%
mutate(geography = fct_reorder(geography, asthma_rate)) %>%
ggplot(aes(x = geography, y = asthma_rate, fill = geography)) +
geom_bar(stat = "identity") +
labs(
title = "Mean asthma rate per borough (2004-2018)",
x = "Borough",
y = "Mean Age-Adjusted Asthma Rate per 10,000",
fill = "Borough")
asthma_rate_borough_plot

asthma_borough_time_plot =
asthma_df %>%
group_by(geography, year) %>%
mutate(
age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000),
geography = as.factor(geography)
) %>%
summarise(
asthma_rate = mean(age_adjusted_rate_per_10_000, na.rm = TRUE)
) %>%
ggplot(aes(x = year, y = asthma_rate, fill = geography)) +
geom_bar(stat = "identity",
position = "dodge") +
scale_x_continuous(breaks = seq(2005, 2018, by = 1)) +
labs(
title = "Mean Asthma ER Visits in Boroughs Over Time",
x = "Year",
y = "Mean Asthma ER Visits per 10,000",
fill = "Borough")
## `summarise()` has grouped output by 'geography'. You can override using the
## `.groups` argument.
asthma_borough_time_plot

Statistical Analysis
Question: Do asthma rates differ based on rat sightings?
Statistical modeling plan: Run a regression model, such that: outcome variable = asthma rate, predictor variable = rat sightings, control for = year
rat_df2 = rat_df %>%
group_by(year) %>%
count() %>%
summarise(
n_rat_sightings = n
)
asthma_df2 = asthma_df %>%
mutate(
age_adjusted_rate_per_10_000 = as.numeric(age_adjusted_rate_per_10_000)
) %>%
group_by(year) %>%
summarise(
mean_asthma_rate = mean(age_adjusted_rate_per_10_000, na.rm = TRUE)
)
asthma_rat_df2 = left_join(rat_df2, asthma_df2, by = 'year')
asthma_rat_mod = lm( mean_asthma_rate ~ n_rat_sightings + year, data = asthma_rat_df2)
asthma_rat_mod %>%
broom::tidy()
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -3425. 4643. -0.738 0.502
## 2 n_rat_sightings -0.00425 0.00207 -2.05 0.109
## 3 year 1.78 2.32 0.769 0.485
Check linear model assumptions
# 1. Linear relationship
plot(asthma_rat_mod, 1)

# 2. Normality of residuals
hist(asthma_rat_mod$residuals)

plot(asthma_rat_mod, 2)

# 3. Testing the homoscedasticity assumption
# Plotting residuals
plot(asthma_rat_mod, 3)

# Breusch-Pagan test
lmtest::bptest(asthma_rat_mod)
##
## studentized Breusch-Pagan test
##
## data: asthma_rat_mod
## BP = 3.6938, df = 2, p-value = 0.1577